#Installing Packages:
#install.packages("GGally")
#install.packages("caret")
#install.packages("mgcv")
#install.packages("glmnet")
#install.packages("randomForest")
#install.packages("mice")
#install.packages("rmarkdown")
#install.packages("knitr")
#Import the libraries:
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readxl) # Load readxl package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#Data Preparation:
#Load the data:
# NOTE(review): absolute, user-specific path — consider a relative path so the
# script is portable across machines.
df2 <- read_excel("/Users/hetal/Downloads/RedactedClientConstituent_File.xlsx")
head(df2)
# Check for missing values
# colSums(is.na(...)) reports the count of NA cells in each column.
print(colSums(is.na(df2)))
## CnBio_ID First Gift Date
## 0 27073
## Last Gift Date Largest Gift Date
## 27073 27073
## CnBio_DateAdded CnBio_DateChanged
## 0 0
## CnBio_Key_Indicator CnBio_Deceased
## 0 0
## CnBio_Title_1 CnBio_Marital_status
## 29027 24985
## City State
## 1267 1374
## Reunions attended Zip
## 0 1372
## CnAdrPrf_Type CnCnstncy_1_01_CodeLong
## 0 4882
## CnCnstncy_1_02_CodeLong CnCnstncy_1_03_CodeLong
## 37625 42088
## CnCnstncy_1_04_CodeLong CnSpPrBs_RecordImportID
## 42280 42162
## CnRelEdu_1_01_Class_of CnRelEdu_1_01_Degree
## 25089 20635
## CnRelEdu_1_02_Degree Alumni Board Member
## 39900 42259
## Married to an Alum Personal Email End
## 41713 18106
## Total Lifetime Giving Last 10 Years Giving
## 0 0
## Last 5 Years Giving (FY17-21) CnSpSpBio_ID
## 0 39885
## Alumni
## 0
#Converting the columns for easy accessibility:
# Rename the space/punctuation-laden column names to snake_case in one pass.
# A named lookup vector replaces the previous run of repeated
# colnames()<- assignments, which also duplicated the "Last Gift Date" rename.
rename_map <- c(
  "Total Lifetime Giving"         = "Total_Lifetime_Giving",
  "Last 5 Years Giving (FY17-21)" = "Last_5_Years_Giving",
  "Last 10 Years Giving"          = "Last_10_Years_Giving",
  "Alumni Board Member"           = "Alumni_Board_Member",
  "Last Gift Date"                = "Last_Gift_Date",
  "First Gift Date"               = "First_Gift_Date",
  "Largest Gift Date"             = "Largest_Gift_Date",
  "Personal Email End"            = "Personal_Email_End",
  "Reunions attended"             = "Reunions_attended",
  "Married to an Alum"            = "Married_to_an_Alum"
)
# Only touch columns that actually appear in the mapping.
hits <- colnames(df2) %in% names(rename_map)
colnames(df2)[hits] <- rename_map[colnames(df2)[hits]]
##Shape of DataFrame:
dim(df2)
## [1] 42287 31
#Finding the datatype of variables:
# sapply() returns a list here because the POSIXct columns carry two classes.
sapply(df2, class)
## $CnBio_ID
## [1] "numeric"
##
## $First_Gift_Date
## [1] "POSIXct" "POSIXt"
##
## $Last_Gift_Date
## [1] "POSIXct" "POSIXt"
##
## $Largest_Gift_Date
## [1] "POSIXct" "POSIXt"
##
## $CnBio_DateAdded
## [1] "POSIXct" "POSIXt"
##
## $CnBio_DateChanged
## [1] "POSIXct" "POSIXt"
##
## $CnBio_Key_Indicator
## [1] "character"
##
## $CnBio_Deceased
## [1] "character"
##
## $CnBio_Title_1
## [1] "character"
##
## $CnBio_Marital_status
## [1] "character"
##
## $City
## [1] "character"
##
## $State
## [1] "character"
##
## $Reunions_attended
## [1] "numeric"
##
## $Zip
## [1] "character"
##
## $CnAdrPrf_Type
## [1] "character"
##
## $CnCnstncy_1_01_CodeLong
## [1] "character"
##
## $CnCnstncy_1_02_CodeLong
## [1] "character"
##
## $CnCnstncy_1_03_CodeLong
## [1] "character"
##
## $CnCnstncy_1_04_CodeLong
## [1] "character"
##
## $CnSpPrBs_RecordImportID
## [1] "character"
##
## $CnRelEdu_1_01_Class_of
## [1] "numeric"
##
## $CnRelEdu_1_01_Degree
## [1] "character"
##
## $CnRelEdu_1_02_Degree
## [1] "character"
##
## $Alumni_Board_Member
## [1] "character"
##
## $Married_to_an_Alum
## [1] "character"
##
## $Personal_Email_End
## [1] "character"
##
## $Total_Lifetime_Giving
## [1] "numeric"
##
## $Last_10_Years_Giving
## [1] "numeric"
##
## $Last_5_Years_Giving
## [1] "numeric"
##
## $CnSpSpBio_ID
## [1] "numeric"
##
## $Alumni
## [1] "numeric"
#Finding the summary of dataframe:
# Numeric/date columns get five-number summaries; character columns only
# report length/class.
summary(df2)
## CnBio_ID First_Gift_Date
## Min. : 4 Min. :1901-01-01 00:00:00.000
## 1st Qu.: 173994 1st Qu.:1989-10-24 00:00:00.000
## Median : 292194 Median :2003-02-17 00:00:00.000
## Mean : 24034109 Mean :1998-11-12 07:39:31.447
## 3rd Qu.: 435660 3rd Qu.:2010-10-18 00:00:00.000
## Max. :200003047 Max. :2022-03-14 00:00:00.000
## NA's :27073
## Last_Gift_Date Largest_Gift_Date
## Min. :1901-01-01 00:00:00.00 Min. :1901-01-01 00:00:00.00
## 1st Qu.:1998-03-19 00:00:00.00 1st Qu.:1995-07-31 00:00:00.00
## Median :2008-12-18 00:00:00.00 Median :2006-12-07 00:00:00.00
## Mean :2006-04-01 19:16:48.49 Mean :2003-03-17 06:09:53.45
## 3rd Qu.:2016-02-29 00:00:00.00 3rd Qu.:2013-04-30 00:00:00.00
## Max. :2022-03-15 00:00:00.00 Max. :2022-03-14 00:00:00.00
## NA's :27073 NA's :27073
## CnBio_DateAdded CnBio_DateChanged
## Min. :2017-07-17 00:00:00.00 Min. :2017-07-18 00:00:00.00
## 1st Qu.:2017-07-17 00:00:00.00 1st Qu.:2021-11-27 00:00:00.00
## Median :2017-07-17 00:00:00.00 Median :2022-02-07 00:00:00.00
## Mean :2017-10-04 02:19:28.86 Mean :2021-08-05 14:07:51.11
## 3rd Qu.:2017-07-17 00:00:00.00 3rd Qu.:2022-02-11 00:00:00.00
## Max. :2022-03-17 00:00:00.00 Max. :2022-03-18 00:00:00.00
##
## CnBio_Key_Indicator CnBio_Deceased CnBio_Title_1 CnBio_Marital_status
## Length:42287 Length:42287 Length:42287 Length:42287
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## City State Reunions_attended Zip
## Length:42287 Length:42287 Min. :0.000000 Length:42287
## Class :character Class :character 1st Qu.:0.000000 Class :character
## Mode :character Mode :character Median :0.000000 Mode :character
## Mean :0.007378
## 3rd Qu.:0.000000
## Max. :6.000000
##
## CnAdrPrf_Type CnCnstncy_1_01_CodeLong CnCnstncy_1_02_CodeLong
## Length:42287 Length:42287 Length:42287
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## CnCnstncy_1_03_CodeLong CnCnstncy_1_04_CodeLong CnSpPrBs_RecordImportID
## Length:42287 Length:42287 Length:42287
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## CnRelEdu_1_01_Class_of CnRelEdu_1_01_Degree CnRelEdu_1_02_Degree
## Min. :1929 Length:42287 Length:42287
## 1st Qu.:1985 Class :character Class :character
## Median :2004 Mode :character Mode :character
## Mean :1997
## 3rd Qu.:2014
## Max. :2021
## NA's :25089
## Alumni_Board_Member Married_to_an_Alum Personal_Email_End
## Length:42287 Length:42287 Length:42287
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Total_Lifetime_Giving Last_10_Years_Giving Last_5_Years_Giving
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0
## Mean : 2318 Mean : 989 Mean : 456
## 3rd Qu.: 35 3rd Qu.: 0 3rd Qu.: 0
## Max. :11321975 Max. :8805800 Max. :4121000
##
## CnSpSpBio_ID Alumni
## Min. : 1512 Min. :0.0000
## 1st Qu.: 199564 1st Qu.:0.0000
## Median : 357472 Median :0.0000
## Mean : 68234755 Mean :0.4386
## 3rd Qu.:200000817 3rd Qu.:1.0000
## Max. :200003047 Max. :1.0000
## NA's :39885
#Getting the column names:
# Sanity check that the renames above took effect.
names(df2)
## [1] "CnBio_ID" "First_Gift_Date"
## [3] "Last_Gift_Date" "Largest_Gift_Date"
## [5] "CnBio_DateAdded" "CnBio_DateChanged"
## [7] "CnBio_Key_Indicator" "CnBio_Deceased"
## [9] "CnBio_Title_1" "CnBio_Marital_status"
## [11] "City" "State"
## [13] "Reunions_attended" "Zip"
## [15] "CnAdrPrf_Type" "CnCnstncy_1_01_CodeLong"
## [17] "CnCnstncy_1_02_CodeLong" "CnCnstncy_1_03_CodeLong"
## [19] "CnCnstncy_1_04_CodeLong" "CnSpPrBs_RecordImportID"
## [21] "CnRelEdu_1_01_Class_of" "CnRelEdu_1_01_Degree"
## [23] "CnRelEdu_1_02_Degree" "Alumni_Board_Member"
## [25] "Married_to_an_Alum" "Personal_Email_End"
## [27] "Total_Lifetime_Giving" "Last_10_Years_Giving"
## [29] "Last_5_Years_Giving" "CnSpSpBio_ID"
## [31] "Alumni"
#Feature Engineering:
#After analyzing the data, group all the similar features in same category using encoding method:
# Collapse the raw CnCnstncy_1_01_CodeLong labels into five broad buckets.
# A named lookup vector replaces the nested ifelse() chain: raw values absent
# from the table (including NA) fall through to the literal string "NA".
code01_map <- c(
  "Board" = "Board", "Previous Board" = "Board",
  "Current Fac/Staff" = "Fac/Staff", "Former Fac/Staff" = "Fac/Staff",
  "Student" = "Education/Family", "Parent" = "Education/Family",
  "Education Certificate" = "Education/Family",
  "Friend" = "Friendship/Memorial", "Friends / Memorial" = "Friendship/Memorial",
  "Prospective Benefactor" = "OtherDonors", "Organization" = "OtherDonors",
  "Alumni" = "OtherDonors", "Business" = "OtherDonors",
  "Foundation" = "OtherDonors", "Trust / Business" = "OtherDonors",
  "Dominican Colleges and Universities" = "OtherDonors", "WAICU" = "OtherDonors",
  "Government" = "OtherDonors", "Religious Org" = "OtherDonors",
  "Unknown - Historical" = "OtherDonors"
)
grouped01 <- unname(code01_map[df2$CnCnstncy_1_01_CodeLong])
grouped01[is.na(grouped01)] <- "NA"
df2$grouped_CnCnstncy_1_01_CodeLong <- grouped01
# Ordinal encoding: "NA" bucket -> 0, otherwise the bucket's position in the
# fixed level order (Education/Family = 1 ... OtherDonors = 5).
levels01 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
df2$CnCnstncy_1_01_CodeLong_Encoded <- ifelse(
  grouped01 == "NA", 0,
  as.integer(factor(grouped01, levels = levels01))
)
#After analyzing the data, group all the similar features in same category using encoding method:
# Collapse the raw CnCnstncy_1_02_CodeLong labels into the same five buckets.
# Lookup-vector form of the original nested ifelse(): unmatched values
# (including NA) become the literal string "NA".
code02_map <- c(
  "Board" = "Board", "Previous Board" = "Board",
  "Board of Visitors Advisory Group" = "Board",
  "Current Fac/Staff" = "Fac/Staff", "Former Fac/Staff" = "Fac/Staff",
  "Student" = "Education/Family", "Parent" = "Education/Family",
  "Education Certificate" = "Education/Family",
  "Friend" = "Friendship/Memorial", "Friends / Memorial" = "Friendship/Memorial",
  "Friends / Athletics" = "Friendship/Memorial", "Friends / Agency" = "Friendship/Memorial",
  "Prospective Benefactor" = "OtherDonors", "Organization" = "OtherDonors",
  "Alumni" = "OtherDonors", "Business" = "OtherDonors",
  "Foundation" = "OtherDonors", "Trust" = "OtherDonors",
  "Cutting Edge Alumni" = "OtherDonors", "Religious Org" = "OtherDonors",
  "Unknown - Historical" = "OtherDonors"
)
grouped02 <- unname(code02_map[df2$CnCnstncy_1_02_CodeLong])
grouped02[is.na(grouped02)] <- "NA"
df2$grouped_CnCnstncy_1_02_CodeLong <- grouped02
# Ordinal encoding: "NA" bucket -> 0, otherwise the bucket's level index.
levels02 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
df2$CnCnstncy_1_02_CodeLong_Encoded <- ifelse(
  grouped02 == "NA", 0,
  as.integer(factor(grouped02, levels = levels02))
)
#After analyzing the data, group all the similar features in same category using encoding method:
# Collapse the raw CnCnstncy_1_03_CodeLong labels into the same five buckets.
# Lookup-vector form of the original nested ifelse(): unmatched values
# (including NA) become the literal string "NA".
code03_map <- c(
  "Board" = "Board", "Previous Board" = "Board",
  "Current Fac/Staff" = "Fac/Staff", "Former Fac/Staff" = "Fac/Staff",
  "Student" = "Education/Family", "Parent" = "Education/Family",
  "Education Certificate" = "Education/Family",
  "Friend" = "Friendship/Memorial", "Friends / Memorial" = "Friendship/Memorial",
  "Prospective Benefactor" = "OtherDonors", "Organization" = "OtherDonors",
  "Alumni" = "OtherDonors"
)
grouped03 <- unname(code03_map[df2$CnCnstncy_1_03_CodeLong])
grouped03[is.na(grouped03)] <- "NA"
df2$grouped_CnCnstncy_1_03_CodeLong <- grouped03
# Ordinal encoding: "NA" bucket -> 0, otherwise the bucket's level index.
levels03 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
df2$CnCnstncy_1_03_CodeLong_Encoded <- ifelse(
  grouped03 == "NA", 0,
  as.integer(factor(grouped03, levels = levels03))
)
#After analyzing the data, group all the similar features in same category using encoding method:
# Create a mapping between the original labels and new encoded labels for CnCnstncy_1_04_CodeLong:
label_mapping <- c("NA" = 0, "Former Fac/Staff" = 1, "Previous Board" = 2, "Education Certificate" = 3)
# Replace missing values with "NA" label
df2$CnCnstncy_1_04_CodeLong[is.na(df2$CnCnstncy_1_04_CodeLong)] <- "NA"
# Encode by *name* lookup.  The previous call
#   factor(df2$CnCnstncy_1_04_CodeLong, labels = label_mapping)
# ignored the names of label_mapping: factor() assigns labels in the sorted
# order of the levels, so "Education Certificate" (alphabetically first)
# silently received code 0 instead of the intended 3.  Indexing the named
# vector applies the mapping the author actually wrote down.
df2$CnCnstncy_1_04_CodeLong_Encoded <- factor(
  unname(label_mapping[df2$CnCnstncy_1_04_CodeLong]),
  levels = c(0, 1, 2, 3)
)
#After analyzing the data, group all the similar features in same category using encoding method:
# Collapse CnBio_Marital_status into four buckets via a named lookup vector:
#   Committed  <- Married / Partner / Cohabitation / Engaged
#   Single     <- Divorced / Single / Widowed / Separated
#   Worshipper <- Religious
#   Unknowns   <- "Unknown" or a missing status
# Any other (unmatched) value becomes the literal string "NA".
marital_map <- c(
  "Married" = "Committed", "Partner" = "Committed",
  "Cohabitation" = "Committed", "Engaged" = "Committed",
  "Divorced" = "Single", "Single" = "Single",
  "Widowed" = "Single", "Separated" = "Single",
  "Religious" = "Worshipper",
  "Unknown" = "Unknowns"
)
m_grouped <- unname(marital_map[df2$CnBio_Marital_status])
m_grouped[is.na(df2$CnBio_Marital_status)] <- "Unknowns"
m_grouped[is.na(m_grouped)] <- "NA"
df2$grouped_CnBio_Marital_status <- m_grouped
# Ordinal encoding: "NA" and "Unknowns" -> 0, otherwise the level index
# (Committed = 1, Single = 2, Worshipper = 3).
df2$CnBio_Marital_status_Encoded <- ifelse(
  m_grouped %in% c("NA", "Unknowns"), 0,
  as.integer(factor(m_grouped,
                    levels = c("Committed", "Single", "Worshipper", "Unknowns")))
)
##Label Encoding to States Column:
## First find top 5 and least 5 from total_lifetime, 5 years and 10 years and see whether they are consistent or not
# Integer-encode State as its factor level index, mapping NA to 0.
df2$State <- factor(df2$State)
state_codes <- as.integer(df2$State)
state_codes[is.na(state_codes)] <- 0
df2$State_encoded <- state_codes
# View the resulting data frame
df2
#Replacing for all the values of alumni board member to 1 and 0 for not there:
# Any non-missing, non-empty entry means the constituent served on the board.
df2$Alumni_Board_Member <- ifelse(!is.na(df2$Alumni_Board_Member) & df2$Alumni_Board_Member != "", 1, 0)
# Binary-encode CnBio_Deceased: "No" -> 0, anything else -> 1.
# NOTE(review): an NA here would propagate as NA rather than become 1; the NA
# audit above showed zero missing values in this column, so this is safe today.
df2$CnBio_Deceased <- ifelse(df2$CnBio_Deceased == "No", 0, 1)
# print updated data frame
print(df2)
## # A tibble: 42,287 × 41
## CnBio_ID First_Gift_Date Last_Gift_Date Largest_Gift_Date
## <dbl> <dttm> <dttm> <dttm>
## 1 200001488 NA NA NA
## 2 200001489 NA NA NA
## 3 200001490 NA NA NA
## 4 200001491 NA NA NA
## 5 200001492 NA NA NA
## 6 200001493 NA NA NA
## 7 200001494 NA NA NA
## 8 200001495 NA NA NA
## 9 200001496 NA NA NA
## 10 200001497 NA NA NA
## # ℹ 42,277 more rows
## # ℹ 37 more variables: CnBio_DateAdded <dttm>, CnBio_DateChanged <dttm>,
## # CnBio_Key_Indicator <chr>, CnBio_Deceased <dbl>, CnBio_Title_1 <chr>,
## # CnBio_Marital_status <chr>, City <chr>, State <fct>,
## # Reunions_attended <dbl>, Zip <chr>, CnAdrPrf_Type <chr>,
## # CnCnstncy_1_01_CodeLong <chr>, CnCnstncy_1_02_CodeLong <chr>,
## # CnCnstncy_1_03_CodeLong <chr>, CnCnstncy_1_04_CodeLong <chr>, …
# Binary-encode CnBio_Key_Indicator: "O" -> 0, any other value -> 1.
df2$CnBio_Key_Indicator <- ifelse(df2$CnBio_Key_Indicator == "O", 0, 1)
## Any Degrees Present in Education:
# 0 = no usable degree (both slots NA, or either slot holds "None"/"Unknown"),
# 2 = both degree slots populated, 1 = exactly one populated.
# NOTE(review): a "None"/"Unknown" in either slot forces 0 even when the other
# slot holds a real degree — confirm that precedence is intended.
df2$Any_Degree_Present <- ifelse((is.na(df2$CnRelEdu_1_01_Degree) & is.na(df2$CnRelEdu_1_02_Degree)) |
(df2$CnRelEdu_1_01_Degree %in% c("None", "Unknown")) |
(df2$CnRelEdu_1_02_Degree %in% c("None", "Unknown")), 0,
ifelse(!is.na(df2$CnRelEdu_1_01_Degree) & !is.na(df2$CnRelEdu_1_02_Degree), 2,1))
# Extract year and month from datetime (lubridate accessors; NA dates yield NA)
df2$First_Gift_Year <- year(df2$First_Gift_Date)
df2$First_Gift_Month <- month(df2$First_Gift_Date)
#Data Cleaning:
# Remove the rows with 0 in column 'Total_Lifetime_Giving' which is the target variable to remove distortion in the analysis:
# (never-donors carry no signal for modelling giving amounts)
df2 <- df2[df2$Total_Lifetime_Giving != 0, ]
# print new dataframe
print(df2)
## # A tibble: 15,116 × 44
## CnBio_ID First_Gift_Date Last_Gift_Date Largest_Gift_Date
## <dbl> <dttm> <dttm> <dttm>
## 1 100001361 1973-02-01 00:00:00 2021-12-27 00:00:00 2021-12-27 00:00:00
## 2 100001363 1986-08-11 00:00:00 2014-06-03 00:00:00 1986-08-11 00:00:00
## 3 100001364 1971-12-26 00:00:00 2008-11-10 00:00:00 1987-11-19 00:00:00
## 4 100001366 1977-09-17 00:00:00 2018-04-22 00:00:00 2016-04-24 00:00:00
## 5 100001368 1985-06-10 00:00:00 1996-11-06 00:00:00 1985-06-10 00:00:00
## 6 100001373 1977-05-31 00:00:00 2020-03-16 00:00:00 2006-11-14 00:00:00
## 7 100001376 1977-05-24 00:00:00 2021-11-24 00:00:00 2021-11-24 00:00:00
## 8 100001378 1968-10-05 00:00:00 2020-06-08 00:00:00 1991-03-13 00:00:00
## 9 341286 2009-05-11 00:00:00 2011-11-11 00:00:00 2011-11-11 00:00:00
## 10 337744 2020-05-16 00:00:00 2020-05-16 00:00:00 2020-05-16 00:00:00
## # ℹ 15,106 more rows
## # ℹ 40 more variables: CnBio_DateAdded <dttm>, CnBio_DateChanged <dttm>,
## # CnBio_Key_Indicator <dbl>, CnBio_Deceased <dbl>, CnBio_Title_1 <chr>,
## # CnBio_Marital_status <chr>, City <chr>, State <fct>,
## # Reunions_attended <dbl>, Zip <chr>, CnAdrPrf_Type <chr>,
## # CnCnstncy_1_01_CodeLong <chr>, CnCnstncy_1_02_CodeLong <chr>,
## # CnCnstncy_1_03_CodeLong <chr>, CnCnstncy_1_04_CodeLong <chr>, …
#Dropping the irrelevant columns for the donation purpose:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop identifiers, address detail, and redundant date/spouse columns that
# will not feed the giving analysis (mostly high-missingness fields).
df2 <- df2 %>% select(-CnSpSpBio_ID,-Personal_Email_End,-Largest_Gift_Date,-Zip,-CnSpPrBs_RecordImportID,-CnBio_Title_1,-City,-Last_Gift_Date,-Married_to_an_Alum,-CnRelEdu_1_01_Class_of)
#library(dplyr)
#df2 <- df2 %>% select(-CnRelEdu_1_01_Class_of)
#Remove the single largest Total_Lifetime_Giving row (extreme outlier):
df2 <- df2[-which.max(df2$Total_Lifetime_Giving), ]
#Keep only the numeric columns for the correlation analysis:
numeric_df <- df2[sapply(df2, is.numeric)]
# Pairwise Pearson correlations between all numeric columns.
print(cor(numeric_df))
## CnBio_ID CnBio_Key_Indicator CnBio_Deceased
## CnBio_ID 1.000000000 -0.02707257 0.173067828
## CnBio_Key_Indicator -0.027072570 1.00000000 0.157417708
## CnBio_Deceased 0.173067828 0.15741771 1.000000000
## Reunions_attended 0.011722215 0.03607486 -0.037181403
## Alumni_Board_Member -0.018515463 0.01404689 -0.015135426
## Total_Lifetime_Giving 0.010025236 -0.06476337 -0.005745462
## Last_10_Years_Giving -0.000172846 -0.08877176 -0.025284441
## Last_5_Years_Giving 0.021639108 -0.05990592 -0.023797930
## Alumni -0.075115148 0.24895521 -0.017145338
## CnCnstncy_1_01_CodeLong_Encoded -0.039611696 -0.27438305 -0.043194507
## CnCnstncy_1_02_CodeLong_Encoded -0.016985372 0.03112182 -0.018741522
## CnCnstncy_1_03_CodeLong_Encoded -0.027017278 0.02799232 -0.008792243
## CnBio_Marital_status_Encoded -0.060076252 0.26824803 0.072669025
## State_encoded -0.134263327 -0.02439766 -0.094301389
## Any_Degree_Present -0.105045861 0.23137718 -0.058129480
## First_Gift_Year -0.136690635 -0.01927302 -0.296002621
## First_Gift_Month 0.036036393 -0.03994877 -0.052067518
## Reunions_attended Alumni_Board_Member
## CnBio_ID 0.011722215 -0.0185154632
## CnBio_Key_Indicator 0.036074863 0.0140468939
## CnBio_Deceased -0.037181403 -0.0151354262
## Reunions_attended 1.000000000 0.0748671877
## Alumni_Board_Member 0.074867188 1.0000000000
## Total_Lifetime_Giving 0.003519344 0.0008028766
## Last_10_Years_Giving 0.014176013 0.0020996682
## Last_5_Years_Giving 0.026596257 0.0059125106
## Alumni 0.079893332 0.0564233770
## CnCnstncy_1_01_CodeLong_Encoded 0.076210996 0.0432673310
## CnCnstncy_1_02_CodeLong_Encoded 0.100902301 0.0179823034
## CnCnstncy_1_03_CodeLong_Encoded 0.159452322 0.0200510561
## CnBio_Marital_status_Encoded 0.100662864 0.0297046868
## State_encoded 0.011511602 0.0157708281
## Any_Degree_Present 0.074956572 0.0740118917
## First_Gift_Year -0.142611822 -0.0108378874
## First_Gift_Month -0.028161465 0.0054782915
## Total_Lifetime_Giving Last_10_Years_Giving
## CnBio_ID 0.0100252364 -1.728460e-04
## CnBio_Key_Indicator -0.0647633747 -8.877176e-02
## CnBio_Deceased -0.0057454619 -2.528444e-02
## Reunions_attended 0.0035193443 1.417601e-02
## Alumni_Board_Member 0.0008028766 2.099668e-03
## Total_Lifetime_Giving 1.0000000000 5.920496e-01
## Last_10_Years_Giving 0.5920495964 1.000000e+00
## Last_5_Years_Giving 0.4190867332 6.422503e-01
## Alumni -0.0230186643 -3.200903e-02
## CnCnstncy_1_01_CodeLong_Encoded 0.0154420560 1.550557e-02
## CnCnstncy_1_02_CodeLong_Encoded 0.0297843131 2.197695e-02
## CnCnstncy_1_03_CodeLong_Encoded 0.0713673700 1.404653e-03
## CnBio_Marital_status_Encoded 0.0106121228 3.156821e-05
## State_encoded 0.0040689666 -3.191102e-03
## Any_Degree_Present -0.0155931585 -2.995966e-02
## First_Gift_Year -0.0276762147 9.896727e-03
## First_Gift_Month 0.0113455100 2.590825e-02
## Last_5_Years_Giving Alumni
## CnBio_ID 0.021639108 -0.07511515
## CnBio_Key_Indicator -0.059905923 0.24895521
## CnBio_Deceased -0.023797930 -0.01714534
## Reunions_attended 0.026596257 0.07989333
## Alumni_Board_Member 0.005912511 0.05642338
## Total_Lifetime_Giving 0.419086733 -0.02301866
## Last_10_Years_Giving 0.642250279 -0.03200903
## Last_5_Years_Giving 1.000000000 -0.02264843
## Alumni -0.022648431 1.00000000
## CnCnstncy_1_01_CodeLong_Encoded 0.008444149 0.76569753
## CnCnstncy_1_02_CodeLong_Encoded 0.026330669 0.32290206
## CnCnstncy_1_03_CodeLong_Encoded -0.002389547 0.08729421
## CnBio_Marital_status_Encoded 0.003896422 0.39861807
## State_encoded -0.004823998 -0.04192007
## Any_Degree_Present -0.021808978 0.89129643
## First_Gift_Year 0.006392624 -0.14787781
## First_Gift_Month 0.021706413 -0.02750813
## CnCnstncy_1_01_CodeLong_Encoded
## CnBio_ID -0.039611696
## CnBio_Key_Indicator -0.274383052
## CnBio_Deceased -0.043194507
## Reunions_attended 0.076210996
## Alumni_Board_Member 0.043267331
## Total_Lifetime_Giving 0.015442056
## Last_10_Years_Giving 0.015505574
## Last_5_Years_Giving 0.008444149
## Alumni 0.765697533
## CnCnstncy_1_01_CodeLong_Encoded 1.000000000
## CnCnstncy_1_02_CodeLong_Encoded 0.282285411
## CnCnstncy_1_03_CodeLong_Encoded 0.069207481
## CnBio_Marital_status_Encoded 0.234442371
## State_encoded -0.017327307
## Any_Degree_Present 0.678909063
## First_Gift_Year -0.224824614
## First_Gift_Month -0.015940561
## CnCnstncy_1_02_CodeLong_Encoded
## CnBio_ID -0.01698537
## CnBio_Key_Indicator 0.03112182
## CnBio_Deceased -0.01874152
## Reunions_attended 0.10090230
## Alumni_Board_Member 0.01798230
## Total_Lifetime_Giving 0.02978431
## Last_10_Years_Giving 0.02197695
## Last_5_Years_Giving 0.02633067
## Alumni 0.32290206
## CnCnstncy_1_01_CodeLong_Encoded 0.28228541
## CnCnstncy_1_02_CodeLong_Encoded 1.00000000
## CnCnstncy_1_03_CodeLong_Encoded 0.22385491
## CnBio_Marital_status_Encoded 0.18261191
## State_encoded 0.02382638
## Any_Degree_Present 0.30161287
## First_Gift_Year -0.10473342
## First_Gift_Month -0.01918667
## CnCnstncy_1_03_CodeLong_Encoded
## CnBio_ID -0.027017278
## CnBio_Key_Indicator 0.027992322
## CnBio_Deceased -0.008792243
## Reunions_attended 0.159452322
## Alumni_Board_Member 0.020051056
## Total_Lifetime_Giving 0.071367370
## Last_10_Years_Giving 0.001404653
## Last_5_Years_Giving -0.002389547
## Alumni 0.087294213
## CnCnstncy_1_01_CodeLong_Encoded 0.069207481
## CnCnstncy_1_02_CodeLong_Encoded 0.223854913
## CnCnstncy_1_03_CodeLong_Encoded 1.000000000
## CnBio_Marital_status_Encoded 0.072242463
## State_encoded 0.024987907
## Any_Degree_Present 0.099180314
## First_Gift_Year -0.058744451
## First_Gift_Month -0.011761106
## CnBio_Marital_status_Encoded State_encoded
## CnBio_ID -6.007625e-02 -0.134263327
## CnBio_Key_Indicator 2.682480e-01 -0.024397660
## CnBio_Deceased 7.266903e-02 -0.094301389
## Reunions_attended 1.006629e-01 0.011511602
## Alumni_Board_Member 2.970469e-02 0.015770828
## Total_Lifetime_Giving 1.061212e-02 0.004068967
## Last_10_Years_Giving 3.156821e-05 -0.003191102
## Last_5_Years_Giving 3.896422e-03 -0.004823998
## Alumni 3.986181e-01 -0.041920074
## CnCnstncy_1_01_CodeLong_Encoded 2.344424e-01 -0.017327307
## CnCnstncy_1_02_CodeLong_Encoded 1.826119e-01 0.023826376
## CnCnstncy_1_03_CodeLong_Encoded 7.224246e-02 0.024987907
## CnBio_Marital_status_Encoded 1.000000e+00 0.017226068
## State_encoded 1.722607e-02 1.000000000
## Any_Degree_Present 3.614488e-01 -0.012859653
## First_Gift_Year -1.903332e-01 0.079489176
## First_Gift_Month -3.643123e-02 0.028969371
## Any_Degree_Present First_Gift_Year
## CnBio_ID -0.10504586 -0.136690635
## CnBio_Key_Indicator 0.23137718 -0.019273023
## CnBio_Deceased -0.05812948 -0.296002621
## Reunions_attended 0.07495657 -0.142611822
## Alumni_Board_Member 0.07401189 -0.010837887
## Total_Lifetime_Giving -0.01559316 -0.027676215
## Last_10_Years_Giving -0.02995966 0.009896727
## Last_5_Years_Giving -0.02180898 0.006392624
## Alumni 0.89129643 -0.147877814
## CnCnstncy_1_01_CodeLong_Encoded 0.67890906 -0.224824614
## CnCnstncy_1_02_CodeLong_Encoded 0.30161287 -0.104733420
## CnCnstncy_1_03_CodeLong_Encoded 0.09918031 -0.058744451
## CnBio_Marital_status_Encoded 0.36144880 -0.190333220
## State_encoded -0.01285965 0.079489176
## Any_Degree_Present 1.00000000 -0.118145454
## First_Gift_Year -0.11814545 1.000000000
## First_Gift_Month -0.02140397 0.180048917
## First_Gift_Month
## CnBio_ID 0.036036393
## CnBio_Key_Indicator -0.039948769
## CnBio_Deceased -0.052067518
## Reunions_attended -0.028161465
## Alumni_Board_Member 0.005478291
## Total_Lifetime_Giving 0.011345510
## Last_10_Years_Giving 0.025908253
## Last_5_Years_Giving 0.021706413
## Alumni -0.027508133
## CnCnstncy_1_01_CodeLong_Encoded -0.015940561
## CnCnstncy_1_02_CodeLong_Encoded -0.019186667
## CnCnstncy_1_03_CodeLong_Encoded -0.011761106
## CnBio_Marital_status_Encoded -0.036431225
## State_encoded 0.028969371
## Any_Degree_Present -0.021403968
## First_Gift_Year 0.180048917
## First_Gift_Month 1.000000000
cor_matrix <- cor(numeric_df)
# create a heatmap of the correlation matrix (base graphics; dendrograms off
# so rows/columns keep their original order)
heatmap(cor_matrix,
Rowv = NA, Colv = NA, # turn off row and column dendrograms
symm = TRUE, # use symmetric color scale
margins = c(10, 10))
#Exploratory Data Analysis:
##Graph for :CnCnstncy_1_01_CodeLong_Encoded:
# Tally the records per raw CnCnstncy_1_01_CodeLong value; count() is the
# shorthand for group_by() + summarize(n()).
grouped_df <- df2 %>%
  count(CnCnstncy_1_01_CodeLong, name = "total_count")
# Pie chart = polar-transformed stacked bar of the counts.
ggplot(grouped_df, aes(x = "", y = total_count, fill = CnCnstncy_1_01_CodeLong)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL, fill = "CnCnstncy_1_01_CodeLong",
       title = "Distribution of CnCnstncy_1_01_CodeLong") +
  theme_void()
##Graph for :CnCnstncy_1_02_CodeLong_Encoded:
# Tally the records per raw CnCnstncy_1_02_CodeLong value; count() is the
# shorthand for group_by() + summarize(n()).
grouped_df <- df2 %>%
  count(CnCnstncy_1_02_CodeLong, name = "total_count")
# Pie chart = polar-transformed stacked bar of the counts.
ggplot(grouped_df, aes(x = "", y = total_count, fill = CnCnstncy_1_02_CodeLong)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL, fill = "CnCnstncy_1_02_CodeLong",
       title = "Distribution of CnCnstncy_1_02_CodeLong") +
  theme_void()
# create scatter plot of lifetime giving against year of first gift
# Use bare column names inside aes(): the previous df2$... form triggered
# "Use of `df2$...` is discouraged" warnings and breaks ggplot2's
# faceting/grouping semantics because it bypasses the data argument.
ggplot(df2, aes(x = First_Gift_Year, y = Total_Lifetime_Giving)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 110000)) + # clip extreme gifts for readability
  labs(x = "First_Gift_Year Axis Label", y = "Total_Lifetime_Giving", title = "Scatter Plot Title")
# Count the frequency of each value
freq <- table(df2$Alumni)
# Create a dataframe with the counts and percentages
# NOTE(review): `value` holds the *counts*, not the 0/1 Alumni codes; the fill
# legend is relabelled manually below.  This relies on table() returning the
# non-alumni (0) group first and on there being exactly two groups — verify
# before reusing on other columns.
df_freq <- data.frame(value = as.numeric(freq),
percentage = round(as.numeric(freq) / sum(as.numeric(freq)) * 100, 1))
# Create the pie chart with percentage labels
ggplot(data = df_freq, aes(x = "", y = value, fill = factor(value))) +
geom_bar(width = 1, stat = "identity") +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(percentage, "%")),
position = position_stack(vjust = 0.5)) +
labs(title = "Alumni Member Distribution") +
scale_fill_manual(values = c("Pink", "Yellow"),
labels = c("Non-Alumni Member", "Alumni Board Member")) +
theme_void()
# Aggregate total lifetime giving by the year of the first gift, then plot
# the trend line.
total_giving_by_year <- df2 %>%
  group_by(First_Gift_Year) %>%
  summarize(Total_Lifetime_Giving = sum(Total_Lifetime_Giving))
ggplot(total_giving_by_year, aes(x = First_Gift_Year, y = Total_Lifetime_Giving)) +
  geom_line(color = "purple") +
  labs(x = "First Gift Year", y = "Total Lifetime Giving",
       title = "First Gift Year vs Total Lifetime Giving")
##Dropping the rows as the values in The First Gift Year from 1900 to 1940:
library(dplyr)
# Keep only post-1940 first gifts; earlier years are too sparse to trend.
numeric_df <- filter(numeric_df, First_Gift_Year > 1940)
# Aggregate total lifetime giving by the calendar month of the first gift.
total_giving_by_month <- numeric_df %>%
  group_by(First_Gift_Month) %>%
  summarize(Total_Lifetime_Giving = sum(Total_Lifetime_Giving))
ggplot(total_giving_by_month, aes(x = First_Gift_Month, y = Total_Lifetime_Giving)) +
  geom_line() +
  labs(x = "First Gift Month", y = "Total Lifetime Giving",
       title = "First Gift Month vs Total Lifetime Giving")
library(ggplot2)
##1
# Total lifetime giving per (first-gift year, alumni flag) pair.
grouped_df <- df2 %>%
  group_by(First_Gift_Year, Alumni) %>%
  summarize(total_lifetime_giving = sum(Total_Lifetime_Giving))
# Bubble chart: point size and y-position both encode the giving total;
# colour separates alumni from non-alumni.
ggplot(grouped_df,
       aes(x = First_Gift_Year, y = total_lifetime_giving,
           size = total_lifetime_giving, color = Alumni)) +
  geom_point() +
  scale_size(range = c(1, 10)) +
  labs(x = "Year", y = "Total Lifetime Giving",
       title = "Bubble chart of Total Lifetime Giving by Year and Alumni")
##2
# Total lifetime giving per (first-gift year, degree-present flag) pair.
grouped_df <- df2 %>%
  group_by(First_Gift_Year, Any_Degree_Present) %>%
  summarize(total_lifetime_giving = sum(Total_Lifetime_Giving))
# Bubble chart: point size and y-position both encode the giving total;
# colour separates the degree-count levels.
ggplot(grouped_df,
       aes(x = First_Gift_Year, y = total_lifetime_giving,
           size = total_lifetime_giving, color = Any_Degree_Present)) +
  geom_point() +
  scale_size(range = c(1, 10)) +
  labs(x = "Year", y = "Total Lifetime Giving",
       title = "Bubble chart of Total Lifetime Giving by Year and Alumni Board Member")
##Linear trend plots of Total_Lifetime_Giving against selected predictors.
# Helper: scatter plus least-squares line for one predictor, selected by
# column name via the .data pronoun so the same code serves all three plots.
plot_linear <- function(data, xvar) {
  ggplot(data, aes(.data[[xvar]], Total_Lifetime_Giving)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(x = xvar, y = "Total_Lifetime_Giving", title = "Linear Graph Title")
}
##1
plot_linear(df2, "CnCnstncy_1_01_CodeLong_Encoded")
##2
plot_linear(df2, "First_Gift_Year")
##3
plot_linear(df2, "Last_10_Years_Giving")
library(dplyr) # load the dplyr package for data manipulation
# Sort by Total_Lifetime_Giving (descending) and collect the State values of
# the 40 highest-giving rows (repeats possible).
# NOTE(review): the earlier comment said "top 10 states" but the code slices
# 40 rows — confirm which was intended.
topstates <- df2 %>%
arrange(desc(Total_Lifetime_Giving)) %>%
slice(1:40) %>%
pull(State)
# create a scatter plot of giving for rows in those top states
ggplot(data = filter(df2, State %in% topstates),
aes(x = State, y = Total_Lifetime_Giving)) +
geom_point() +
labs(x = "State", y = "Total Lifetime Giving") +
ylim(0, 1200000) # adjust the y-axis limits to accommodate the highest value
## Warning: Removed 7 rows containing missing values (`geom_point()`).
library(ggplot2)
library(dplyr)
## Ten states with the highest summed lifetime giving.
# Drop the "N/A" placeholder state before aggregating (filtering on the
# grouping column before or after summarise gives the same groups).
df_top10 <- df2 %>%
  filter(State != "N/A") %>%
  group_by(State) %>%
  summarise(Total_Lifetime_Giving = sum(Total_Lifetime_Giving)) %>%
  arrange(desc(Total_Lifetime_Giving)) %>%
  slice_head(n = 10)
ggplot(df_top10, aes(x = reorder(State, Total_Lifetime_Giving), y = Total_Lifetime_Giving)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Top 10 Highest States by Total Lifetime Giving",
       x = "State", y = "Total Lifetime Giving") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Ten states with the lowest summed lifetime giving.
df_bottom10 <- df2 %>%
  filter(State != "N/A") %>%
  group_by(State) %>%
  summarise(Total_Lifetime_Giving = sum(Total_Lifetime_Giving)) %>%
  arrange(Total_Lifetime_Giving) %>%
  slice_head(n = 10)
ggplot(df_bottom10, aes(x = reorder(State, Total_Lifetime_Giving), y = Total_Lifetime_Giving)) +
  geom_bar(stat = "identity", fill = "pink") +
  labs(title = "Top 10 Lowest States by Total Lifetime Giving",
       x = "State", y = "Total Lifetime Giving") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Top with 5years
df_top10 <- df2 %>%
group_by(State) %>%
summarise(Last_5_Years_Giving = sum(Last_5_Years_Giving)) %>%
filter(State != "N/A") %>%
arrange(desc(Last_5_Years_Giving)) %>%
head(10)
ggplot(df_top10, aes(x = reorder(State, Last_5_Years_Giving), y = Last_5_Years_Giving)) +
geom_bar(stat = "identity", fill = "blue") +
ggtitle("Top 10 Highest States by Last_5_Years_Giving") +
xlab("State") +
ylab("Last_5_Years_Giving") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#bottom with the 5 years
df_bottom10 <- df2 %>%
group_by(State) %>%
summarise(Last_5_Years_Giving = sum(Last_5_Years_Giving)) %>%
filter(State != "N/A") %>%
arrange(Last_5_Years_Giving) %>%
head(10)
ggplot(df_bottom10, aes(x = reorder(State, Last_5_Years_Giving), y = Last_5_Years_Giving)) +
geom_bar(stat = "identity", fill = "pink") +
ggtitle("Top 10 Lowest States by Last_5_Years_Giving") +
xlab("State") +
ylab("Last_5_Years_Giving") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#top with 10 years
df_top10 <- df2 %>%
group_by(State) %>%
summarise(Last_10_Years_Giving = sum(Last_10_Years_Giving)) %>%
filter(State != "N/A") %>%
arrange(desc(Last_10_Years_Giving)) %>%
head(10)
ggplot(df_top10, aes(x = reorder(State, Last_10_Years_Giving), y = Last_10_Years_Giving)) +
geom_bar(stat = "identity", fill = "blue") +
ggtitle("Top 10 Highest States by Last_10_Years_Giving") +
xlab("State") +
ylab("Last_10_Years_Giving") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Bottom 10 with Last 10 years
df_bottom10 <- df2 %>%
group_by(State) %>%
summarise(Last_10_Years_Giving = sum(Last_10_Years_Giving)) %>%
filter(State != "N/A") %>%
arrange(Last_10_Years_Giving) %>%
head(10)
ggplot(df_bottom10, aes(x = reorder(State, Last_10_Years_Giving), y = Last_10_Years_Giving)) +
geom_bar(stat = "identity", fill = "pink") +
ggtitle("Top 10 Lowest States by Last_10_Years_Giving") +
xlab("State") +
ylab("Last_10_Years_Giving") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# The three histograms below were copy-pasted with different zoom levels;
# the shared plot recipe is factored into one helper.
# Helper: histogram of Total_Lifetime_Giving over a given x-range.
# NOTE: xlim() *drops* observations outside the limits (which is why the
# rendered output reported "Removed ... rows"); use coord_cartesian() if
# zooming without dropping is ever wanted instead.
giving_histogram <- function(data, bin_width, bar_fill, lower, upper) {
  ggplot(data, aes(x = Total_Lifetime_Giving)) +
    geom_histogram(binwidth = bin_width, fill = bar_fill) +
    xlim(lower, upper)
}
#1 broad view: $10k-$1M in $100k bins
giving_histogram(numeric_df, 100000, "orange", 10000, 1000000)
#2 mid view: $10k-$260k in $10k bins
giving_histogram(numeric_df, 10000, "darkgreen", 10000, 260000)
#3 tail view: $100k-$7M in $100k bins
giving_histogram(numeric_df, 100000, "darkblue", 100000, 7000000)
# Work on a copy of the engineered numeric data frame.
# FIX: use `<-` for assignment rather than `=` (tidyverse style).
df <- numeric_df
dim(df)
## [1] 14927 17
# Log-transform lifetime giving (presumably to reduce right skew).
# NOTE(review): log() yields -Inf for zero-dollar donors. The NA counts below
# show no missing values, but zeros are not ruled out by anything visible
# here — confirm, and prefer log1p() if zeros exist.
df$log_Total_Lifetime_Giving <- log(df$Total_Lifetime_Giving)
# Count missing values per column of the modeling frame.
colSums(is.na(numeric_df))
## CnBio_ID CnBio_Key_Indicator
## 0 0
## CnBio_Deceased Reunions_attended
## 0 0
## Alumni_Board_Member Total_Lifetime_Giving
## 0 0
## Last_10_Years_Giving Last_5_Years_Giving
## 0 0
## Alumni CnCnstncy_1_01_CodeLong_Encoded
## 0 0
## CnCnstncy_1_02_CodeLong_Encoded CnCnstncy_1_03_CodeLong_Encoded
## 0 0
## CnBio_Marital_status_Encoded State_encoded
## 0 0
## Any_Degree_Present First_Gift_Year
## 0 0
## First_Gift_Month
## 0
library(dplyr)
#numeric_df <- numeric_df %>% select(-CnRelEdu_1_01_Class_of)
#numeric_df <- numeric_df %>% select(-CnSpSpBio_ID)
# Regress total lifetime giving on every remaining numeric column.
model <- lm(Total_Lifetime_Giving ~ ., data = numeric_df)
# Extract in-sample fitted values and residuals from the fit.
# (Note: these variable names shadow base R's fitted()/residuals() functions.)
fitted <- fitted(model)
residuals <- resid(model)
# Assemble the diagnostic data for plotting.
plot_df <- data.frame(
  Fitted_Values = fitted,
  Residuals = residuals
)
# Residuals-vs-fitted scatter with a smoother — the standard diagnostic for
# spotting non-linearity in a regression fit.
ggplot(plot_df, aes(x = Fitted_Values, y = Residuals)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(x = "Fitted Values", y = "Residuals", title = "Residual Plot")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
##When conducting a residual analysis, a "residuals versus fits plot" is the most frequently created plot. It is a scatter plot of residuals on the y axis and fitted values (estimated responses) on the x axis. The plot is used to detect non-linearity.
#Model Evaluation:
#Finding the optimal features for analysis:
## Lasso find the optimal variables:
# Load the glmnet package
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
# Convert the data to matrix format: predictors x, response y
x <- as.matrix(numeric_df[, -which(names(numeric_df) == "Total_Lifetime_Giving")])
y <- numeric_df$Total_Lifetime_Giving
# Fit the full LASSO regularization path
lasso <- glmnet(x, y, alpha = 1)
# Plot the LASSO regularization path.
# FIX: the original passed width/height to plot(), which are not graphical
# parameters for plot.glmnet and only generated a stream of warnings.
plot(lasso, xvar = "lambda", label = TRUE)
# Choose the optimal lambda value using cross-validation.
# NOTE(review): cv.glmnet's fold assignment is random and no seed is set
# immediately before this call, so lambda.min depends on the RNG state —
# consider a dedicated set.seed() here for reproducibility.
cv.lasso <- cv.glmnet(x, y, alpha = 1)
lambda <- cv.lasso$lambda.min
# Extract the coefficients at the optimal lambda value
lasso.coef <- coef(lasso, s = lambda)
lasso.coef <- lasso.coef[-1, ] # Exclude the intercept term
# Identify the predictors with non-zero LASSO coefficients
lasso.features <- names(numeric_df)[-which(names(numeric_df) == "Total_Lifetime_Giving")]
lasso.features <- lasso.features[which(lasso.coef != 0)]
# Re-draw the regularization path (this duplicates the plot above; kept so
# the rendered report still shows the path after feature extraction).
plot(lasso, xvar = "lambda", label = TRUE)
#Analyzing with Linear Regression with Train-Test and Validating the data:
##On Training test and validation for linear regression:
library(caret)
## Loading required package: lattice
# Split the data into training/validation (75%) and testing (25%) sets
set.seed(123)
trainIndex <- createDataPartition(numeric_df$Total_Lifetime_Giving, p = 0.75, list = FALSE)
train_val <- numeric_df[trainIndex, ]
test <- numeric_df[-trainIndex, ]
# Split the training/validation set into a training set (70%) and a validation set (30%)
trainIndex2 <- createDataPartition(train_val$Total_Lifetime_Giving, p = 0.7, list = FALSE)
train <- train_val[trainIndex2, ]
validation <- train_val[-trainIndex2, ]
# Fit the model using the training set (LASSO-selected predictors)
model <- lm(Total_Lifetime_Giving ~ CnCnstncy_1_02_CodeLong_Encoded + CnBio_Key_Indicator+ CnCnstncy_1_03_CodeLong_Encoded+Last_10_Years_Giving+Reunions_attended+Last_5_Years_Giving+CnCnstncy_1_01_CodeLong_Encoded,
data = train)
# Use the model to predict the testing set
predictions_test <- predict(model, newdata = test)
# BUG FIX: the original reported summary(model)$r.squared — the TRAINING R^2
# (0.60) — while labelling it the validation R^2, and the validation split
# was never used at all. Score the model on the held-out validation set.
predictions_val <- predict(model, newdata = validation)
rsq_val <- 1 - sum((validation$Total_Lifetime_Giving - predictions_val)^2) /
  sum((validation$Total_Lifetime_Giving - mean(validation$Total_Lifetime_Giving))^2)
print(rsq_val)
# Calculate the R-squared value of the model on the testing set
rsq_test <- 1 - sum((test$Total_Lifetime_Giving - predictions_test)^2) / sum((test$Total_Lifetime_Giving - mean(test$Total_Lifetime_Giving))^2)
print(rsq_test)
## [1] 0.1075281
#Analyzing with Ridge Regression with Train-Test and Validating the data:
library(glmnet)
# Split the data into training (70%), validation (20%) and testing (~10%) sets
set.seed(123)
train_index <- sample(1:nrow(numeric_df), 0.7 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
train <- numeric_df[train_index, ]
val <- numeric_df[val_index, ]
test <- numeric_df[test_index, ]
# Prepare the data for modeling
y_train <- train$Total_Lifetime_Giving
y_val <- val$Total_Lifetime_Giving
y_test <- test$Total_Lifetime_Giving
# Single definition of the ridge predictor set.
# FIX: the original repeated this column vector four times, including a
# duplicated x_val assignment that recomputed the identical matrix.
ridge_predictors <- c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')
x_train <- data.matrix(train[, ridge_predictors])
x_val <- data.matrix(val[, ridge_predictors])
x_test <- data.matrix(test[, ridge_predictors])
# Fit the Ridge regression model using cross-validation on the training set.
# NOTE(review): despite the original comments, the validation split is never
# used — lambda is chosen by CV on the training set. The lambda grid
# (0.001..1) is also very narrow for a dollar-scale response; confirm it
# actually brackets lambda.min.
cv_model <- cv.glmnet(x_train, y_train, alpha = 0, lambda = seq(0.001, 1, length = 100))
# Extract the best lambda value
best_lambda <- cv_model$lambda.min
# Fit the Ridge regression model with the best lambda value using the training set
best_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda)
# Use the fitted model to make predictions on the test set
y_predicted <- predict(best_model, s = best_lambda, newx = x_test)
# Find SST and SSE.
# NOTE(review): the SST baseline here is mean(y_train) while the lasso and
# tree sections use mean(y_test), so the reported R-squared values are not
# strictly comparable across sections.
sst <- sum((y_test - mean(y_train))^2)
sse <- sum((y_predicted - y_test)^2)
# Find R-squared
rsq <- 1 - sse/sst
rsq
## [1] 0.05516379
#Analyzing with Lasso Regression with Train-Test and Validating the data:
##lasso with validation:
# Load necessary libraries
library(caret)
library(glmnet)
# Split data into training (70%), validation (15%) and test (15%) sets:
# the 30% held out from training is split in half below.
set.seed(123)
train_index <- createDataPartition(numeric_df$Total_Lifetime_Giving, p = 0.7, list = FALSE)
train_data <- numeric_df[train_index, ]
valid_test_data <- numeric_df[-train_index, ]
valid_index <- createDataPartition(valid_test_data$Total_Lifetime_Giving, p = 0.5, list = FALSE)
valid_data <- valid_test_data[valid_index, ]
test_data <- valid_test_data[-valid_index, ]
# Define response variable
y_train <- train_data$Total_Lifetime_Giving
y_valid <- valid_data$Total_Lifetime_Giving
y_test <- test_data$Total_Lifetime_Giving
# Define matrix of predictor variables (the same 7 features in each split)
x_train <- data.matrix(train_data[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Last_10_Years_Giving','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])
x_valid <- data.matrix(valid_data[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Last_10_Years_Giving','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])
x_test <- data.matrix(test_data[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Last_10_Years_Giving','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])
# Fit Lasso regression model using cross-validation on the training set.
# NOTE(review): cv.glmnet's fold assignment is random; it depends on the RNG
# state left by the partition calls above, so results shift if any call is
# added or removed before this line.
cv_model <- cv.glmnet(x_train, y_train, alpha = 1, lambda = seq(0.001, 1, length = 100))
# Extract the best lambda value
best_lambda <- cv_model$lambda.min
# Refit the Lasso model at the best lambda on the TRAINING set only.
# (The original comment claimed training and validation were combined here —
# they are not; only x_train/y_train are passed.)
best_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda)
# NOTE(review): these validation-set predictions are computed but never
# scored or used anywhere below.
y_valid_predicted <- predict(best_model, s = best_lambda, newx = x_valid)
# Make predictions on test set using the fitted model
y_test_predicted <- predict(best_model, s = best_lambda, newx = x_test)
# Find SST and SSE for test set (baseline = test-set mean)
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_test_predicted - y_test)^2)
# Find R-Squared for test set
rsq_test <- 1 - sse_test/sst_test
rsq_test
## [1] 0.05918001
#Analyzing with Decision tree with Train-Test and Validating the data:
##validation for decision_tree
# NOTE(review): pROC is loaded but never used in this section (it provides
# ROC tooling for classification; this is a regression tree).
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
# Define response variable
y <- numeric_df$Total_Lifetime_Giving
# Define matrix of predictor variables (6 encoded/numeric features)
x <- data.matrix(numeric_df[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])
# Split the data into training (75%), validation (20%) and testing (~5%)
# sets — the validation sample is drawn from the remaining 25%, so only
# about 5% of rows are left for the test set.
set.seed(123)
n <- nrow(numeric_df)
train_index <- sample(n, 0.75 * n)
valid_index <- sample(setdiff(1:n, train_index), 0.2 * n)
test_index <- setdiff(setdiff(1:n, train_index), valid_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_valid <- x[valid_index, ]
y_valid <- y[valid_index]
x_test <- x[test_index, ]
y_test <- y[test_index]
# Fit decision tree regression model on training set ("anova" = regression tree)
model <- rpart(y_train ~ ., data = data.frame(x_train, y_train), method = "anova")
# Use fitted model to make predictions on validation set
y_predicted_valid <- predict(model, newdata = data.frame(x_valid))
# Calculate RMSE and R-squared on validation set.
# NOTE(review): rmse_valid and rsq_valid are computed but never printed or
# used anywhere below.
rmse_valid <- sqrt(mean((y_predicted_valid - y_valid)^2))
sst_valid <- sum((y_valid - mean(y_valid))^2)
sse_valid <- sum((y_predicted_valid - y_valid)^2)
rsq_valid <- 1 - sse_valid/sst_valid
# Use fitted model to make predictions on test set
y_predicted_test <- predict(model, newdata = data.frame(x_test), type = "vector")
# Calculate R-squared on test set (baseline = test-set mean)
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_predicted_test - y_test)^2)
rsq_test <- 1 - sse_test/sst_test
# Print the R-squared values for test sets
cat("Test set:\n")
## Test set:
cat(paste0("R-squared: ", rsq_test, "\n"))
## R-squared: 0.358177370788151
#Analyzing with Random forest Regression with Train-Test and Validating the data:
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Split the data into training (75%), validation (20%) and testing (~5%) sets.
# NOTE(review): this section reuses the x/y objects defined in the decision
# tree section above (6 predictors, response = Total_Lifetime_Giving); it
# will fail if run standalone.
set.seed(123)
train_index <- sample(nrow(numeric_df), 0.75 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_val <- x[val_index, ]
y_val <- y[val_index]
x_test <- x[test_index, ]
y_test <- y[test_index]
# Fit random forest regression model on training set (default hyperparameters)
model <- randomForest(y_train ~ ., data = data.frame(x_train, y_train))
# Use fitted model to make predictions on validation set
y_predicted_val <- predict(model, newdata = data.frame(x_val))
# Calculate R-squared on validation set.
# NOTE(review): rsq_val is computed but never printed or used below.
sst_val <- sum((y_val - mean(y_val))^2)
sse_val <- sum((y_predicted_val - y_val)^2)
rsq_val <- 1 - sse_val/sst_val
# Use fitted model to make predictions on test set
y_predicted_test <- predict(model, newdata = data.frame(x_test))
# Calculate R-squared on test set (baseline = test-set mean)
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_predicted_test - y_test)^2)
rsq_test <- 1 - sse_test/sst_test
# Print the R-squared values for test set
print(paste0("Test Set R-squared: ", rsq_test))
## [1] "Test Set R-squared: 0.369206316866739"
#Analyzing with SVM with Train-Test and Validating the data:
library(e1071)
# Define response variable
y <- numeric_df$Total_Lifetime_Giving
# Define matrix of predictor variables.
# BUG FIX: the predictors were previously taken from df2 (the raw ~42k-row
# frame) while y and the row indices below come from numeric_df (~15k rows),
# so predictor rows and response values were misaligned — consistent with
# the negative R-squared the original run produced. Build x from numeric_df.
svm_predictors <- c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Last_10_Years_Giving','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')
x <- data.matrix(numeric_df[, svm_predictors])
# Split the data into training (75%), validation (20%) and testing (~5%) sets
set.seed(123)
train_index <- sample(nrow(numeric_df), 0.75 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_val <- x[val_index, ]
y_val <- y[val_index]
x_test <- x[test_index, ]
y_test <- y[test_index]
# Baseline fit with default hyperparameters (kept for comparison; unused below)
model <- svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial")
# Tune gamma/cost by 2-fold cross-validation on the training set.
# NOTE(review): despite the comment in the original, the validation split is
# not actually used here — tuning is CV on the training set.
tuned_model <- tune.svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial", gamma = 10^(-6:1), cost = 10^(-1:2), tunecontrol = tune.control(cross = 2))
summary(tuned_model)
# (The rendered tuning table from the original report is omitted: it was
# produced with the misaligned df2 predictors and is stale after the fix.)
best_gamma <- tuned_model$best.parameters$gamma
best_cost <- tuned_model$best.parameters$cost
# Refit on the training set with the tuned hyperparameters
final_model <- svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial", gamma = best_gamma, cost = best_cost)
# Use fitted model to make predictions on the held-out test set
y_predicted <- predict(final_model, newdata = data.frame(x_test))
# Calculate R-squared on test set (baseline = test-set mean)
sst <- sum((y_test - mean(y_test))^2)
sse <- sum((y_predicted - y_test)^2)
rsq <- 1 - sse/sst
# Print the R-squared values
print(paste0("R-squared: ", rsq))
# Filter to lifetime giving between $200,000 and $5,000,000.
# FIX: the original comment said "between 1,000 USD and 200,000 USD", which
# did not match the code.
df_filtered <- subset(numeric_df, Total_Lifetime_Giving >= 200000 & Total_Lifetime_Giving <= 5000000)
#Load the ggplot2 library
library(ggplot2)
# Box plot of Total_Lifetime_Giving over the filtered range.
# FIX: the title said "Price Range" — this dataset has no price column; the
# variable plotted is total lifetime giving.
ggplot(df_filtered, aes(x = Total_Lifetime_Giving)) +
  geom_boxplot() +
  ggtitle("Total Lifetime Giving Range: $200000 to $5000000") +
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
#Create a R2 frame:
library(dplyr)
library(ggplot2)
# create a data frame for R-squared values
# NOTE(review): several of these hard-coded values contradict the test-set
# R-squared values printed earlier in this report: Lasso printed 0.059 (not
# 0.128), Random Forest printed 0.369 (not 0.8424), and SVM printed -0.047
# (not 0.70267). Presumably they came from a different run or data version —
# verify the source of these numbers before publishing this chart.
rsq_df <- data.frame(Model = c("Linear Regression","Ridge Regression","Lasso Regression","Decision Tree","Random Forest","SVM"),
R_Squared = c(0.1075281,0.055,0.128, 0.3581, 0.8424,0.70267))
# reorder the data frame by R-squared values in descending order
# (the factor levels pin the bar order in the plot below)
rsq_df <- rsq_df %>%
arrange(desc(R_Squared)) %>%
mutate(Model = factor(Model, levels = Model))
# create a bar plot for R-squared values with adjusted width
ggplot(rsq_df, aes(x = Model, y = R_Squared, width = 0.3)) +
geom_bar(stat = "identity", fill = "darkblue") +
labs(x = "Model", y = "R-Squared", title = "Comparison of R-Squared Values") +
theme(axis.text.x = element_text(angle = 0, vjust = 0.2, hjust = 0.5))